/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP

#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#include <Kokkos_Core_fwd.hpp>

#include <impl/Kokkos_TaskBase.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp> // CUDA_SAFE_CALL
#include <impl/Kokkos_TaskTeamMember.hpp>

//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {
namespace {

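// A device function's address taken in host code is not valid on the device,
// so this single-thread kernel records TaskType::apply and TaskType::destroy
// from device code; the get_function_pointer() specializations below copy
// them back to the host through unified memory.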
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
  ( typename TaskType::function_type * ptr, typename TaskType::destroy_type* dtor )
{ 
  *ptr = TaskType::apply;
  *dtor = TaskType::destroy;
}

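// Kernel entry point for task-queue execution: each warp of the launch acts
// as one team, and 'shmem_size' bytes of the dynamically allocated shared
// memory are set aside per warp for staging task closures.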
template< typename Scheduler >
__global__
void cuda_task_queue_execute( Scheduler scheduler, int32_t shmem_size ) {
  TaskQueueSpecialization< Scheduler >::driver( std::move(scheduler) , shmem_size );
}

} // namespace (anonymous)

template <class, class> class TaskExec ;

template<class QueueType>
class TaskQueueSpecialization<
  SimpleTaskScheduler<Kokkos::Cuda, QueueType>
>
{
public:

  using scheduler_type = SimpleTaskScheduler<Kokkos::Cuda, QueueType>;
  using execution_space = Kokkos::Cuda;
  using memory_space = Kokkos::CudaUVMSpace;
  using member_type = TaskExec<Kokkos::Cuda, scheduler_type> ;

  enum : long { max_league_size = 16 };
  enum : int { warps_per_block = 4 };

  KOKKOS_INLINE_FUNCTION
  static
  void iff_single_thread_recursive_execute( scheduler_type const& ) {}

  static int get_max_team_count(
    execution_space const&
  ) {
    return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block;
  }

  __device__
  static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
  {
    using queue_type = typename scheduler_type::task_queue_type;
    using task_base_type = typename scheduler_type::task_base_type;
    using runnable_task_base_type = typename scheduler_type::runnable_task_base_type;
    using scheduling_info_storage_type =
      SchedulingInfoStorage<
        runnable_task_base_type,
        typename scheduler_type::task_scheduling_info_type
      >;

    extern __shared__ int32_t shmem_all[];

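    // Partition the dynamic shared memory into one contiguous slice per warp;
    // threadIdx.z identifies the warp within the block.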
    int32_t* const warp_shmem = shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t);

    task_base_type* const shared_memory_task_copy = (task_base_type*)warp_shmem;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& queue = scheduler.queue();
    auto& team_scheduler = team_exec.scheduler();

    auto current_task = OptionalRef<task_base_type>();

    // Loop until all queues are empty and no tasks in flight
    while(not queue.is_done()) {

      if(warp_lane == 0) {  // should be (?) same as team_exec.team_rank() == 0
        // pop off a task
        current_task = queue.pop_ready_task(team_scheduler.team_scheduler_info());
      }

      // Broadcast task pointer:

      // Sync before the broadcast
      KOKKOS_IMPL_CUDA_SYNCWARP;
      
      // The task pointer is 64 bits wide but warp shuffles move 32-bit words,
      // so broadcast it from lane 0 in two halves (treating the OptionalRef
      // as a pair of ints):
      ((int*) &current_task)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[0], 0, 32);
      ((int*) &current_task)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*) &current_task)[1], 0, 32);

      if(current_task) {

        KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());

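        // 'b' is the offset (in int32 words) just past the task header
        // (runnable task base plus scheduling info); 'e' is the total
        // allocation size in words.  The full closure is copied into shared
        // memory, but only the user portion [b, e) is copied back out;
        // header fields that must survive (predecessor, priority, respawn
        // flag) are written back selectively by lane 0 below.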
        int32_t b = sizeof(scheduling_info_storage_type) / sizeof(int32_t);
        static_assert(
          sizeof(scheduling_info_storage_type) % sizeof(int32_t) == 0,
          "bad task size"
        );
        int32_t const e = current_task->get_allocation_size() / sizeof(int32_t);
        KOKKOS_ASSERT(current_task->get_allocation_size() % sizeof(int32_t) == 0);

        int32_t volatile* const task_mem = (int32_t volatile*)current_task.get();

        // do a coordinated copy of the task closure from global to shared memory:
        for(int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) {
          warp_shmem[i] = task_mem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.
        KOKKOS_IMPL_CUDA_SYNCWARP;

        if(shared_memory_task_copy->is_team_runnable()) {
          // Thread Team Task
          shared_memory_task_copy->as_runnable_task().run(team_exec);
        }
        else if(threadIdx.y == 0) {
          // TODO @tasking @optimization DSH Change this to warp_lane == 0 when we allow blockDim.x to be more than 1
          // Single Thread Task
          shared_memory_task_copy->as_runnable_task().run(single_exec);
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        KOKKOS_IMPL_CUDA_SYNCWARP;

        //if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
        //b -= b % CudaTraits::WarpSize;

        // copy task closure from shared to global memory:
        for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) {
          task_mem[i] = warp_shmem[i];
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to the root thread of the warp for
        // respawn or completion.

        KOKKOS_IMPL_CUDA_SYNCWARP;


        if (warp_lane == 0) {
          // If respawn requested copy respawn data back to main memory
          if(shared_memory_task_copy->as_runnable_task().get_respawn_flag()) {
            if(shared_memory_task_copy->as_runnable_task().has_predecessor()) {
              // It's not necessary to make this a volatile write because
              // the next read of the predecessor is on this thread in complete,
              // and the predecessor is cleared there (using a volatile write)
              current_task->as_runnable_task().acquire_predecessor_from(
                shared_memory_task_copy->as_runnable_task()
              );
            }

            // It may not be necessary to make this a volatile write, since the
            // next read will be done by this thread in complete where the
            // rescheduling occurs, but since the task could be stolen later
            // before this is written again, we should do the volatile write
            // here.  (It might not be necessary though because I don't know
            // where else the priority would be read after it is scheduled
            // by this thread; for now, we leave it volatile, but we should
            // benchmark the cost of this.)
            current_task.as_volatile()->set_priority(shared_memory_task_copy->get_priority());

            // It's not necessary to make this a volatile write, since the
            // next read of it (if true) will be by this thread in `complete()`,
            // which will unset the flag (using volatile) once it has handled
            // the respawn
            current_task->as_runnable_task().set_respawn_flag();

          }

          queue.complete(
            (*std::move(current_task)).as_runnable_task(),
            team_scheduler.team_scheduler_info()
          );
        }

      }
    }
  }

  static
  void execute(scheduler_type const& scheduler)
  {
    const int shared_per_warp = 2048 ;
    const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
    const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
    const int shared_total = shared_per_warp * warps_per_block;
    const cudaStream_t stream = nullptr;

    KOKKOS_ASSERT(
      static_cast<long>(grid.x * grid.y * grid.z * block.x * block.y * block.z)
        == static_cast<long>(get_max_team_count(scheduler.get_execution_space()) * Kokkos::Impl::CudaTraits::WarpSize)
    );

    auto& queue = scheduler.queue();

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

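    // The driver invokes tasks through function pointers, so the compiler
    // cannot size the device stack statically; bump it to at least 2 KiB
    // (presumably an empirical margin) for the duration of the launch.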
    // Query the stack size, in bytes:

    size_t previous_stack_size = 0;
    CUDA_SAFE_CALL(cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 1 << 11;

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
    }

    cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(scheduler, shared_per_warp);

    CUDA_SAFE_CALL(cudaGetLastError());

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    if (previous_stack_size < larger_stack_size) {
      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
    }
  }

  template <typename TaskType>
  static
  // TODO @tasking @optimization DSH specialize this for trivially destructible types
  void
  get_function_pointer(
    typename TaskType::function_type& ptr,
    typename TaskType::destroy_type& dtor
  )
  {
    using function_type = typename TaskType::function_type;
    using destroy_type = typename TaskType::destroy_type;

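    // Allocate host-visible (unified) scratch space, have a one-thread kernel
    // write the device-side apply/destroy addresses into it, then read them
    // back on the host.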
    // TODO @tasking @minor DSH make sure there aren't any alignment concerns?
    void* storage = cuda_internal_scratch_unified( 
      Kokkos::Cuda(),
      sizeof(function_type) + sizeof(destroy_type)
    );
    function_type* ptr_ptr = (function_type*)storage;
    destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));

    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);

    CUDA_SAFE_CALL( cudaGetLastError() );
    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    ptr = *ptr_ptr;
    dtor = *dtor_ptr;
  }
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

template<class Scheduler>
class TaskQueueSpecializationConstrained<
  Scheduler,
  typename std::enable_if<
    std::is_same<typename Scheduler::execution_space, Kokkos::Cuda>::value
  >::type
>
{
public:

  using scheduler_type = Scheduler;
  using execution_space = Kokkos::Cuda;
  using memory_space = Kokkos::CudaUVMSpace;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler> ;

  enum : long { max_league_size = 16 };

  KOKKOS_INLINE_FUNCTION
  static
  void iff_single_thread_recursive_execute( scheduler_type const& ) {}

  __device__
  static void driver(scheduler_type scheduler, int32_t shmem_per_warp)
  {
    using queue_type = typename scheduler_type::queue_type;
    using task_root_type = TaskBase;

    extern __shared__ int32_t shmem_all[];

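    // Sentinel values for task_ptr: 'end' means no task could be popped on
    // this pass (try again); nullptr means every queue has drained and the
    // driver should exit.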
    task_root_type* const end = (task_root_type *) task_root_type::EndTag ;
    task_root_type* const no_more_tasks_sentinel = nullptr;

    int32_t * const warp_shmem =
      shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);

    task_root_type * const task_shmem = (task_root_type *) warp_shmem ;

    const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;

    member_type single_exec(scheduler, warp_shmem, 1);
    member_type team_exec(scheduler, warp_shmem, blockDim.y);

    auto& team_queue = team_exec.scheduler().queue();

    task_root_type * task_ptr = no_more_tasks_sentinel;

    // Loop until all queues are empty and no tasks in flight

    do {

      // Each team lead attempts to acquire either a thread team task
      // or collection of single thread tasks for the team.

      if ( 0 == warp_lane ) {

        if( *((volatile int *) & team_queue.m_ready_count) > 0 ) {
          task_ptr = end;
          // Attempt to acquire a task
          // Loop by priority and then type
          for ( int i = 0 ; i < queue_type::NumQueue && end == task_ptr ; ++i ) {
            for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
              task_ptr = queue_type::pop_ready_task( & team_queue.m_ready[i][j] );
            }
          }
        }
        else {
          // returns nullptr if and only if all other queues have a ready
          // count of 0 also. Otherwise, returns a task from another queue
          // or `end` if one couldn't be popped
          task_ptr = team_queue.attempt_to_steal_task();
          #if 0
          if(task_ptr != no_more_tasks_sentinel && task_ptr != end) {
            std::printf("task stolen on rank %d\n", team_exec.league_rank());
          }
          #endif
        }

      }

      // Synchronize warp with memory fence before broadcasting task pointer:

      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
      KOKKOS_IMPL_CUDA_SYNCWARP ;

      // Broadcast task pointer (64 bits, moved as two 32-bit shuffles from lane 0):

      ((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
      ((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );

      #if defined( KOKKOS_DEBUG )
      KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
      #endif

      if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count

      if ( end != task_ptr ) {

        // Whole warp copy task's closure to/from shared memory.
        // Use all threads of warp for coalesced read/write.

        int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
        int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);

        int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;

        KOKKOS_ASSERT(e * sizeof(int32_t) < shmem_per_warp);

        // copy task closure from global to shared memory:

        for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
          warp_shmem[i] = task_mem[i] ;
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
        KOKKOS_IMPL_CUDA_SYNCWARP ;

        if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
          // Thread Team Task
          (*task_shmem->m_apply)( task_shmem , & team_exec );
        }
        else if ( 0 == threadIdx.y ) {
          // Single Thread Task
          (*task_shmem->m_apply)( task_shmem , & single_exec );
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to all threads in the warp.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
        KOKKOS_IMPL_CUDA_SYNCWARP ;

        // copy task closure from shared to global memory:

        for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
          task_mem[i] = warp_shmem[i] ;
        }

        // Synchronize threads of the warp and ensure memory
        // writes are visible to the root thread of the warp for
        // respawn or completion.

        // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
        KOKKOS_IMPL_CUDA_SYNCWARP ;

        // If respawn requested copy respawn data back to main memory

        if ( 0 == warp_lane ) {

          if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
            ( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
            ( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
          }

          team_queue.complete( task_ptr );
        }

      }
    } while(1);
  }

  static
  void execute(scheduler_type const& scheduler)
  {
    const int shared_per_warp = 2048 ;
    const int warps_per_block = 4 ;
    const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
    //const dim3 grid( 1 , 1 , 1 );
    const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
    const int shared_total = shared_per_warp * warps_per_block ;
    const cudaStream_t stream = 0 ;

    auto& queue = scheduler.queue();
    queue.initialize_team_queues(warps_per_block * grid.x);

    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    // Query the stack size, in bytes:

    size_t previous_stack_size = 0 ;
    CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );

    // If not large enough then set the stack size, in bytes:

    const size_t larger_stack_size = 2048 ;

    if ( previous_stack_size < larger_stack_size ) {
      CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
    }

    cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( scheduler , shared_per_warp );

    CUDA_SAFE_CALL( cudaGetLastError() );

    CUDA_SAFE_CALL( cudaDeviceSynchronize() );

    if ( previous_stack_size < larger_stack_size ) {
      CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
    }

  }

  template< typename TaskType >
  static
  void
  get_function_pointer(
    typename TaskType::function_type& ptr,
    typename TaskType::destroy_type& dtor
  )
    {
      using function_type = typename TaskType::function_type;
      using destroy_type = typename TaskType::destroy_type;

      void* storage = cuda_internal_scratch_unified(
        Kokkos::Cuda(),
        sizeof(function_type) + sizeof(destroy_type)
      );
      function_type* ptr_ptr = (function_type*)storage;
      destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type));

      CUDA_SAFE_CALL( cudaDeviceSynchronize() );

      set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr_ptr, dtor_ptr);

      CUDA_SAFE_CALL( cudaGetLastError() );
      CUDA_SAFE_CALL( cudaDeviceSynchronize() );

      ptr = *ptr_ptr;
      dtor = *dtor_ptr;

    }
};

extern template class TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;

}} /* namespace Kokkos::Impl */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

/**\brief  Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
 *         passed to tasks running in a Cuda space.
 *
 *  Cuda thread blocks for tasking are dimensioned:
 *    blockDim.x == vector length
 *    blockDim.y == team size
 *    blockDim.z == number of teams
 *  where
 *    blockDim.x * blockDim.y == WarpSize
 *
 *  Current implementation requires blockDim.x == 1.
 *  Vector level parallelism with blockDim.x > 1 on Volta will
 *  require a vector-level synchronization mask for vector-level
 *  collective operations.
 *
 *  Both single thread and thread team tasks are run by a full Cuda warp.
 *  A single thread task is called by warp lane #0 and the remaining
 *  lanes of the warp are idle.
 *
 *  When executing a single thread task the syncwarp or other
 *  warp synchronizing functions must not be called.
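 *
 *  For example, the execute() functions above launch with
 *  blockDim == (1, WarpSize, warps_per_block) == (1, 32, 4):
 *  four warps per block, each warp executing one team.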
 */
template <class Scheduler>
class TaskExec<Kokkos::Cuda, Scheduler>
{
private:

  enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize };

  TaskExec( TaskExec && ) = delete ;
  TaskExec( TaskExec const & ) = delete ;
  TaskExec & operator = ( TaskExec && ) = delete ;
  TaskExec & operator = ( TaskExec const & ) = delete ;

  friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda, default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> > ;
  template <class, class>
  friend class Kokkos::Impl::TaskQueueSpecializationConstrained;
  template <class>
  friend class Kokkos::Impl::TaskQueueSpecialization;

  int32_t * m_team_shmem ;
  const int m_team_size ;
  Scheduler m_scheduler;

  // If constructed with arg_team_size == 1 the object
  // can only be used by 0 == threadIdx.y.
  KOKKOS_INLINE_FUNCTION
  TaskExec(
    Scheduler const& parent_scheduler,
    int32_t* arg_team_shmem,
    int arg_team_size = blockDim.y
  )
    : m_team_shmem(arg_team_shmem),
      m_team_size(arg_team_size),
      m_scheduler(parent_scheduler.get_team_scheduler(league_rank()))
  { }

public:

  using thread_team_member = TaskExec;

#if defined( __CUDA_ARCH__ )
  __device__ int team_rank() const { return threadIdx.y ; }
  __device__ int team_size() const { return m_team_size ; }
  //__device__ int league_rank() const { return threadIdx.z; }
  __device__ int league_rank() const { return blockIdx.x * blockDim.z + threadIdx.z; }
  __device__ int league_size() const { return blockDim.z * gridDim.x; }

  __device__ void team_barrier() const
    {
      if ( 1 < m_team_size ) {
        KOKKOS_IMPL_CUDA_SYNCWARP ;
      }
    }

  template< class ValueType >
  __device__ void team_broadcast( ValueType & val , const int thread_id ) const
    {
      if ( 1 < m_team_size ) {
        // WarpSize == blockDim.x * blockDim.y
        // thread_id < blockDim.y
        ValueType tmp( val ); // input might not be a register variable
        cuda_shfl( val, tmp, blockDim.x * thread_id, WarpSize );
      }
    }

#else
  __host__ int team_rank() const { return 0 ; }
  __host__ int team_size() const { return 0 ; }
  __host__ int league_rank() const { return 0; }
  __host__ int league_size() const { return 0; }
  __host__ void team_barrier() const {}
  template< class ValueType >
  __host__ void team_broadcast( ValueType & , const int ) const {}
#endif

  KOKKOS_INLINE_FUNCTION Scheduler const& scheduler() const noexcept { return m_scheduler; }
  KOKKOS_INLINE_FUNCTION Scheduler& scheduler() noexcept { return m_scheduler; }

};

}} /* namespace Kokkos::Impl */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template<typename iType, typename Scheduler>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec<Kokkos::Cuda, Scheduler>>
{
  using index_type = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const iType start ;
  const iType end ;
  const iType increment ;
  member_type const& thread;

#if defined( __CUDA_ARCH__ )

  __device__ inline
  TeamThreadRangeBoundariesStruct
    ( member_type const& arg_thread, const iType& arg_count)
    : start( threadIdx.y )
    , end(arg_count)
    , increment( blockDim.y )
    , thread(arg_thread)
    {}

  __device__ inline
  TeamThreadRangeBoundariesStruct
    ( member_type const& arg_thread
    , const iType & arg_start
    , const iType & arg_end
    )
    : start( arg_start + threadIdx.y )
    , end(   arg_end)
    , increment( blockDim.y )
    , thread( arg_thread )
    {}

#else

  TeamThreadRangeBoundariesStruct
    ( member_type const& arg_thread, const iType& arg_count);

  TeamThreadRangeBoundariesStruct
    ( member_type const& arg_thread
    , const iType & arg_start
    , const iType & arg_end
    );

#endif

};

//----------------------------------------------------------------------------

template<typename iType, typename Scheduler>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda, Scheduler > >
{
  using index_type = iType;
  using member_type = TaskExec<Kokkos::Cuda, Scheduler>;

  const index_type start ;
  const index_type end ;
  const index_type increment ;
  const member_type& thread;

#if defined( __CUDA_ARCH__ )

  __device__ inline
  ThreadVectorRangeBoundariesStruct
    ( member_type const& arg_thread, const index_type& arg_count )
    : start( threadIdx.x )
    , end(arg_count)
    , increment( blockDim.x )
    , thread(arg_thread)
    {}

  __device__ inline
  ThreadVectorRangeBoundariesStruct
    ( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end )
    : start( arg_begin + threadIdx.x )
    , end(arg_end)
    , increment( blockDim.x )
    , thread(arg_thread)
    {}

#else

  ThreadVectorRangeBoundariesStruct
    ( member_type const& arg_thread, const index_type& arg_count );

  ThreadVectorRangeBoundariesStruct
    ( member_type const& arg_thread, const index_type& arg_begin, const index_type& arg_end);

#endif

};

}} /* namespace Kokkos::Impl */

//----------------------------------------------------------------------------

namespace Kokkos {

//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
//{
//  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
//}
//
//template<typename iType1, typename iType2>
//KOKKOS_INLINE_FUNCTION
//Impl::TeamThreadRangeBoundariesStruct
//  < typename std::common_type<iType1,iType2>::type
//  , Impl::TaskExec< Kokkos::Cuda > >
//TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//               , const iType1 & begin, const iType2 & end )
//{
//  typedef typename std::common_type< iType1, iType2 >::type iType;
//  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
//           thread, iType(begin), iType(end) );
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                 , const iType & count )
//{
//  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
//}
//
//template<typename iType>
//KOKKOS_INLINE_FUNCTION
//Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
//ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
//                 , const iType & arg_begin
//                 , const iType & arg_end )
//{
//  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,arg_begin,arg_end);
//}

// KOKKOS_INLINE_FUNCTION
// Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

// KOKKOS_INLINE_FUNCTION
// Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
// PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
// {
//   return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
// }

/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
 * This functionality requires C++11 support.
 */
template<typename iType, class Lambda, class Scheduler>
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries
  , const Lambda& lambda
  )
{
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i);
  }
}
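
// Usage sketch (illustrative, not part of this file): from inside a task's
// operator(), where `member` is the TaskExec team handle and TeamThreadRange
// is the generic overload provided via <impl/Kokkos_TaskTeamMember.hpp>;
// `n` and `values` are placeholder names:
//
//   parallel_for(Kokkos::TeamThreadRange(member, n),
//                [&](int i) { values(i) += 1; });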

template< typename iType, class Lambda, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_for
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda) {
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i);
  }
}

// reduce across corresponding lanes between team members within warp
// assume stride*team_size == warp_size
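// e.g. team_size == 8, stride == 4: lane_delta takes the values 16, 8, 4,
// each step joining a lane's value with the value lane_delta lanes above it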
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_warp_reduction
  (const JoinType& join,
   ValueType& val,
   int team_size,
   int stride)
{
  for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
    join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
  }
}

// multiple within-warp non-strided reductions
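// e.g. vec_length == 8: lane_delta takes the values 4, 2, 1, folding the
// upper half of each vector into its lower half at each step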
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_warp_reduction
  (const JoinType& join,
   ValueType& val,
   int vec_length)
{
  for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
    join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
  }
}

// broadcast within warp
template< class ValueType >
KOKKOS_INLINE_FUNCTION
ValueType shfl_warp_broadcast
  (ValueType& val,
   int src_lane,
   int width)
{
  if ( 1 < width ) {
    return Kokkos::shfl(val, src_lane, width);
  }
  else {
    return val ;
  }
}

/*// all-reduce across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   const JoinType& join,
   ValueType& initialized_result) {

  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  strided_shfl_warp_reduction<ValueType, JoinType>(
                          join,
                          initialized_result,
                          loop_boundaries.thread.team_size(),
                          blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}*/

// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   ValueType& initialized_result) {

  //TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  if ( 1 < loop_boundaries.thread.team_size() ) {

    strided_shfl_warp_reduction(
      [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
      initialized_result,
      loop_boundaries.thread.team_size(),
      blockDim.x);

    initialized_result =
      shfl_warp_broadcast<ValueType>(
        initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
  }
}

template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   const ReducerType& reducer) {

  typedef typename ReducerType::value_type ValueType;
  //TODO @internal_documentation what is the point of creating this temporary?
  ValueType result = ValueType();
  reducer.init(result);

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }

  if ( 1 < loop_boundaries.thread.team_size() ) {
    strided_shfl_warp_reduction(
      [&] (ValueType& val1, const ValueType& val2) { reducer.join(val1,val2); },
      result,
      loop_boundaries.thread.team_size(),
      blockDim.x);

    reducer.reference() =
      shfl_warp_broadcast<ValueType>(
        result, threadIdx.x, Impl::CudaTraits::WarpSize );
  }
  else {
    reducer.reference() = result ;
  }
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
/*template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   const JoinType& join,
   ValueType& initialized_result) {

  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}*/

// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   ValueType& initialized_result) {

  ValueType result = initialized_result;

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }

  initialized_result = result;

  if ( 1 < loop_boundaries.thread.team_size() ) {
    //initialized_result = multi_shfl_warp_reduction(
    multi_shfl_warp_reduction(
      [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
      initialized_result,
      blockDim.x);

    initialized_result =
      shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
  }
}

template< typename iType, class Lambda, typename ReducerType, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Lambda & lambda,
   const ReducerType& reducer) {

  typedef typename ReducerType::value_type ValueType;

  ValueType result = ValueType();
  reducer.init(result);

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }

  if ( 1 < loop_boundaries.thread.team_size() ) {
    multi_shfl_warp_reduction(
      [&] (ValueType& val1, const ValueType& val2) { reducer.join(val1,val2); },
      result,
      blockDim.x);

    reducer.reference() =
      shfl_warp_broadcast<ValueType>( result, 0, blockDim.x );
  }
  else {
    reducer.reference() = result ;
  }
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
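// With blockDim.x == 1 (the current requirement) the inclusive-scan loop
// below uses shuffle offsets 1, 2, 4, 8, 16 across the warp's 32 lanes,
// i.e. a standard shuffle-based (Hillis-Steele) inclusive scan.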
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Closure & closure )
{
  // Extract value_type from closure

  using value_type =
    typename Kokkos::Impl::FunctorAnalysis
      < Kokkos::Impl::FunctorPatternInterface::SCAN
      , void
      , Closure >::value_type ;

  if ( 1 < loop_boundaries.thread.team_size() ) {

    // All lanes must execute the same number of loop iterations so the warp
    // stays converged for the shuffles below; iterate to a common bound and
    // guard the closure calls with (i < loop_boundaries.end).
    const iType bound = loop_boundaries.end + loop_boundaries.start ;
    const int lane = threadIdx.y * blockDim.x ;

    value_type accum = 0 ;
    value_type val, y, local_total;

    for( iType i = loop_boundaries.start; i < bound; i+=loop_boundaries.increment) {
      val = 0;
      if ( i < loop_boundaries.end ) closure(i,val,false);

      // Intra-warp scan on 'val' across the blockDim.y team members;
      // 'accum' carries the running total from previous iterations and
      // 'local_total' will receive this iteration's overall sum.

      // INCLUSIVE scan
      for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
        y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
        if(lane >= offset) { val += y; }
      }

      // pass accum to all threads
      local_total = shfl_warp_broadcast<value_type>(
         val,
         threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
         Impl::CudaTraits::WarpSize);

      // make EXCLUSIVE scan by shifting values over one
      val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
      if ( threadIdx.y == 0 ) { val = 0 ; }

      val += accum;
      if ( i < loop_boundaries.end ) closure(i,val,true);
      accum += local_total;
    }
  }
  else {
    value_type accum = 0 ;
    for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
      closure(i,accum,true);
    }
  }
}

// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
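// e.g. blockDim.x == 8: the inclusive-scan loop below uses shuffle offsets
// 1, 2, 4 within each 8-lane vector.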
template< typename iType, class Closure, class Scheduler >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda, Scheduler > >& loop_boundaries,
   const Closure & closure )
{
  // Extract value_type from closure

  using value_type =
    typename Kokkos::Impl::FunctorAnalysis
      < Kokkos::Impl::FunctorPatternInterface::SCAN
      , void
      , Closure >::value_type ;

  if ( 1 < loop_boundaries.thread.team_size() ) {

    // All lanes must execute the same number of loop iterations so the warp
    // stays converged for the shuffles below; iterate to a common bound and
    // guard the closure calls with (i < loop_boundaries.end).
    const iType bound = loop_boundaries.end + loop_boundaries.start ;

    value_type accum = 0 ;
    value_type val, y, local_total;

    for( iType i = loop_boundaries.start; i < bound; i+=loop_boundaries.increment) {
      val = 0;
      if ( i < loop_boundaries.end ) closure(i,val,false);

      // Intra-vector (blockDim.x) scan on 'val';
      // 'accum' carries the running total from previous iterations and
      // 'local_total' will receive this iteration's overall sum.

      // INCLUSIVE scan
      for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
        y = Kokkos::shfl_up(val, offset, blockDim.x);
        if(threadIdx.x >= offset) { val += y; }
      }

      // pass accum to all threads
      local_total = shfl_warp_broadcast<value_type>(val, blockDim.x-1, blockDim.x);

      // make EXCLUSIVE scan by shifting values over one
      val = Kokkos::shfl_up(val, 1, blockDim.x);
      if ( threadIdx.x == 0 ) { val = 0 ; }

      val += accum;
      if ( i < loop_boundaries.end ) closure(i,val,true);
      accum += local_total;
    }
  }
  else {
    value_type accum = 0 ;
    for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
      closure(i,accum,true);
    }
  }
}

} /* namespace Kokkos */

namespace Kokkos {

  template<class FunctorType, class Scheduler>
  KOKKOS_INLINE_FUNCTION
  void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
    if(threadIdx.x == 0) lambda();
#endif
  }
  
  template<class FunctorType, class Scheduler>
  KOKKOS_INLINE_FUNCTION
  void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
    if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
#endif
  }
  
  template<class FunctorType, class ValueType, class Scheduler>
  KOKKOS_INLINE_FUNCTION
  void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& s , const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
    if(threadIdx.x == 0) lambda(val);
    if ( 1 < s.team_member.team_size() ) {
      val = shfl(val,0,blockDim.x);
    }
#endif
  }
  
  template<class FunctorType, class ValueType, class Scheduler>
  KOKKOS_INLINE_FUNCTION
  void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda, Scheduler > >& single_struct, const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
    if(threadIdx.x == 0 && threadIdx.y == 0) {
      lambda(val);
    }
    single_struct.team_member.team_broadcast(val,0);
#endif
  }

} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */

